# (removed) A stray bare `library` statement stood here; without parentheses
# it loads nothing and only auto-prints the body of the library() function.

library(ggplot2)
library(ISLR)
library(ISLR2)
## 
## Attaching package: 'ISLR2'
## The following objects are masked from 'package:ISLR':
## 
##     Auto, Credit
library(corrplot)
## corrplot 0.92 loaded
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caret)
## Loading required package: lattice
library(splines)
library(mgcv)
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode

Use the non-linear fitting techniques to fit flexible models. Create plots of the results obtained and write (explain) summary of your findings.

Wage data set -> dependent variable : wage

data(Wage)  # attach the Wage data set (3,000 workers, 11 variables)

goal)다른 변수들을 이용해서 wage를 예측하는 것을 목표로 한다.

EDA

head(Wage)  # preview the first six rows
##        year age           maritl     race       education             region
## 231655 2006  18 1. Never Married 1. White    1. < HS Grad 2. Middle Atlantic
## 86582  2004  24 1. Never Married 1. White 4. College Grad 2. Middle Atlantic
## 161300 2003  45       2. Married 1. White 3. Some College 2. Middle Atlantic
## 155159 2003  43       2. Married 3. Asian 4. College Grad 2. Middle Atlantic
## 11443  2005  50      4. Divorced 1. White      2. HS Grad 2. Middle Atlantic
## 376662 2008  54       2. Married 1. White 4. College Grad 2. Middle Atlantic
##              jobclass         health health_ins  logwage      wage
## 231655  1. Industrial      1. <=Good      2. No 4.318063  75.04315
## 86582  2. Information 2. >=Very Good      2. No 4.255273  70.47602
## 161300  1. Industrial      1. <=Good     1. Yes 4.875061 130.98218
## 155159 2. Information 2. >=Very Good     1. Yes 5.041393 154.68529
## 11443  2. Information      1. <=Good     1. Yes 4.318063  75.04315
## 376662 2. Information 2. >=Very Good     1. Yes 4.845098 127.11574
summary(Wage)  # numeric summaries and factor level counts for every column
##       year           age                     maritl           race     
##  Min.   :2003   Min.   :18.00   1. Never Married: 648   1. White:2480  
##  1st Qu.:2004   1st Qu.:33.75   2. Married      :2074   2. Black: 293  
##  Median :2006   Median :42.00   3. Widowed      :  19   3. Asian: 190  
##  Mean   :2006   Mean   :42.41   4. Divorced     : 204   4. Other:  37  
##  3rd Qu.:2008   3rd Qu.:51.00   5. Separated    :  55                  
##  Max.   :2009   Max.   :80.00                                          
##                                                                        
##               education                     region               jobclass   
##  1. < HS Grad      :268   2. Middle Atlantic   :3000   1. Industrial :1544  
##  2. HS Grad        :971   1. New England       :   0   2. Information:1456  
##  3. Some College   :650   3. East North Central:   0                        
##  4. College Grad   :685   4. West North Central:   0                        
##  5. Advanced Degree:426   5. South Atlantic    :   0                        
##                           6. East South Central:   0                        
##                           (Other)              :   0                        
##             health      health_ins      logwage           wage       
##  1. <=Good     : 858   1. Yes:2083   Min.   :3.000   Min.   : 20.09  
##  2. >=Very Good:2142   2. No : 917   1st Qu.:4.447   1st Qu.: 85.38  
##                                      Median :4.653   Median :104.92  
##                                      Mean   :4.654   Mean   :111.70  
##                                      3rd Qu.:4.857   3rd Qu.:128.68  
##                                      Max.   :5.763   Max.   :318.34  
## 
str(Wage)  # structure: 3000 obs. of 11 variables, with column types
## 'data.frame':    3000 obs. of  11 variables:
##  $ year      : int  2006 2004 2003 2003 2005 2008 2009 2008 2006 2004 ...
##  $ age       : int  18 24 45 43 50 54 44 30 41 52 ...
##  $ maritl    : Factor w/ 5 levels "1. Never Married",..: 1 1 2 2 4 2 2 1 1 2 ...
##  $ race      : Factor w/ 4 levels "1. White","2. Black",..: 1 1 1 3 1 1 4 3 2 1 ...
##  $ education : Factor w/ 5 levels "1. < HS Grad",..: 1 4 3 4 2 4 3 3 3 2 ...
##  $ region    : Factor w/ 9 levels "1. New England",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ jobclass  : Factor w/ 2 levels "1. Industrial",..: 1 2 1 2 2 2 1 2 2 2 ...
##  $ health    : Factor w/ 2 levels "1. <=Good","2. >=Very Good": 1 2 1 2 1 2 2 1 2 2 ...
##  $ health_ins: Factor w/ 2 levels "1. Yes","2. No": 2 2 1 1 1 1 1 1 1 1 ...
##  $ logwage   : num  4.32 4.26 4.88 5.04 4.32 ...
##  $ wage      : num  75 70.5 131 154.7 75 ...

observations : 3000
variables : 11

1.year :임금 정보가 기록된 연도
2.age :노동자의 나이
3.maritl :{1:결혼안함, 2:기혼, 3:사별, 4:이혼, 5:별거} -> 노동자의 결혼 상태
4.race :{1:백인, 2:흑인, 3:아시안, 4:기타} -> 노동자의 인종
5.education :{1:고졸미만, 2:고졸, 3:일부 대학, 4:대학 졸업, 5:대졸 이상} -> 교육 수준
6.region :지역
7.jobclass :{1:산업, 2:정보} -> 직업 분류
8.health :{1:좋음 이하, 2:매우 좋음 이상} -> 노동자의 건강 상태
9.health_ins :{1:있음, 2:없음} -> 건강 보험 여부
10.logwage : 로그 스케일 임금
11.wage : 임금

해당 데이터셋에서 wage를 반응 변수로 선정했고, 나머지 변수들을 예측변수의 후보로 선정하였습니다.
[선형성, 등분산성, 정규성]에 대한 검증을 진행하였습니다.

# Baseline multiple linear regression of wage on the candidate predictors.
# region is excluded: summary(Wage) above shows every row is Middle Atlantic,
# so it has no variation; logwage is excluded as it is a transform of wage.
wage_model = lm(wage ~ year + age + maritl + race + education + jobclass + health + health_ins, data = Wage)
summary(wage_model)
## 
## Call:
## lm(formula = wage ~ year + age + maritl + race + education + 
##     jobclass + health + health_ins, data = Wage)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -100.33  -18.70   -3.26   13.29  212.79 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -2.423e+03  6.165e+02  -3.931 8.67e-05 ***
## year                         1.241e+00  3.074e-01   4.037 5.54e-05 ***
## age                          2.707e-01  6.223e-02   4.350 1.41e-05 ***
## maritl2. Married             1.718e+01  1.720e+00   9.985  < 2e-16 ***
## maritl3. Widowed             2.052e+00  8.005e+00   0.256  0.79774    
## maritl4. Divorced            3.967e+00  2.887e+00   1.374  0.16951    
## maritl5. Separated           1.153e+01  4.844e+00   2.380  0.01736 *  
## race2. Black                -5.096e+00  2.146e+00  -2.375  0.01760 *  
## race3. Asian                -2.814e+00  2.603e+00  -1.081  0.27978    
## race4. Other                -6.059e+00  5.666e+00  -1.069  0.28505    
## education2. HS Grad          7.759e+00  2.369e+00   3.275  0.00107 ** 
## education3. Some College     1.834e+01  2.520e+00   7.278 4.32e-13 ***
## education4. College Grad     3.124e+01  2.548e+00  12.259  < 2e-16 ***
## education5. Advanced Degree  5.395e+01  2.811e+00  19.190  < 2e-16 ***
## jobclass2. Information       3.571e+00  1.324e+00   2.697  0.00704 ** 
## health2. >=Very Good         6.515e+00  1.421e+00   4.585 4.72e-06 ***
## health_ins2. No             -1.751e+01  1.403e+00 -12.479  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34 on 2983 degrees of freedom
## Multiple R-squared:  0.3396, Adjusted R-squared:  0.3361 
## F-statistic: 95.89 on 16 and 2983 DF,  p-value: < 2.2e-16
# Linearity check: fitted vs. actual values should track the 45-degree line.
# (Fixed: "Linerity" typo in the title, and abline() was previously passed as
# an argument to plot(), relying on argument evaluation order to draw the
# line -- it is now a separate statement after the plot.)
plot(wage_model$fitted.values, Wage$wage,
     xlab = "Fitted values",
     ylab = "Actual values",
     main = "Linearity: Fitted vs Actual")
abline(0, 1, col = "red")  # identity (y = x) reference line

# Homoscedasticity check: residuals should scatter evenly around zero
plot(wage_model$fitted.values, residuals(wage_model),
     xlab = "Fitted values",
     ylab = "Residuals",
     main = "Residuals vs Fitted Values")
abline(h = 0, col = "red")

# Normality check: studentized residuals vs. theoretical quantiles (car::qqPlot)
qqPlot(wage_model, main = "Q-Q Plot of Residuals")

## 154582 233833 
##    504   2713
  1. Linearity(선형성) 해당 데이터는 몇몇 outlier를 제외하면, fitted value와 actual value 사이에 선형적인 관계가 나타난다는 사실을 확인할 수 있었습니다.

2.Homoscedasticity(등분산성) residual plot을 통해서 확인해 본 결과, 잔차에서 특정한 패턴이 보이진 않았습니다. 선형 회귀 모델이 데이터를 잘 설명하고 있다고 할 수 있습니다.

3.Normality(정규성) Q-Q plot을 그려본 결과, residual의 분포가 right skew 되어 있다는 것을 알 수 있습니다. 정규 분포의 형태를 띄지 않는 것으로 보아 해당 데이터셋에는 linear model이 적절하지 않다는 결론을 내릴 수 있습니다.

결론) 해당 데이터에는 선형모델이 적합하지 않다고 결론을 내립니다.

# 1. correlation
# Encode every factor column as 0-based numeric codes in one step instead of
# six copy-pasted as.numeric(...) - 1 lines. cor() is shift-invariant, so the
# -1 only affects the readability of the encoded values, not the correlations.
# NOTE(review): Pearson correlation with nominal factors (e.g. race) has no
# ordinal meaning -- treat those entries as rough screening only.
num_vars <- Wage %>%
  select(year, age, maritl, race, education, jobclass, health, health_ins, wage) %>%
  mutate(across(where(is.factor), ~ as.numeric(.x) - 1))

cor_matrix <- cor(num_vars, use = "complete.obs")  # remove missing values
corrplot(cor_matrix, method = "number")

wage와 다른 변수들간의 상관관계를 확인한 결과, education이 0.48로 가장 높은 수치를 보였다.

fit the model

set.seed(123)  # reproducible partition below
Wage = na.omit(Wage)  # drop incomplete rows (defensive; none visible in the summaries above)
#train : test = 0.8 : 0.2
trainIndex <- createDataPartition(Wage$wage, p = 0.8, list = FALSE)  # caret: outcome-stratified split
trainset_wage <- Wage[trainIndex, ]
testset_wage <- Wage[-trainIndex, ]
Polynomial model : wage ~ education
ctrl = trainControl(method = "cv", number = 10)  # 10-fold cross-validation
max_degree = 10
cv_errors = rep(NA, max_degree)

# Fit wage ~ poly(education, d, raw = TRUE) for d = 1..max_degree and record
# the CV MSE (caret reports RMSE, which is squared to obtain MSE). This loop
# replaces ten copy-pasted train() calls.
# NOTE(review): education is a 5-level factor (see str(Wage) above), so raw
# polynomial terms of it are unusual -- confirm a numeric encoding is intended.
for (d in seq_len(max_degree)) {
  fml = as.formula(paste0("wage ~ poly(education, ", d, ", raw=TRUE)"))
  model = train(fml, data = trainset_wage, method = "lm", trControl = ctrl)
  cv_errors[d] = mean(model$results$RMSE^2)
}

# Visualize the cross-validation error for each polynomial degree
plot(1:max_degree, cv_errors, type = "b", 
     xlab = "Degree of Polynomial", ylab = "Cross-Validation Error",
     main = "Cross-Validation Error vs. Polynomial Degree")

Polynomial model : wage ~ age
ctrl = trainControl(method = "cv", number = 10)  # 10-fold cross-validation
max_degree = 10
cv_errors = rep(NA, max_degree)

# Fit wage ~ poly(age, d, raw = TRUE) for d = 1..max_degree and record the
# CV MSE (caret's RMSE squared). This loop replaces ten copy-pasted
# train() calls.
for (d in seq_len(max_degree)) {
  fml = as.formula(paste0("wage ~ poly(age, ", d, ", raw=TRUE)"))
  model = train(fml, data = trainset_wage, method = "lm", trControl = ctrl)
  cv_errors[d] = mean(model$results$RMSE^2)
}

# Visualize the cross-validation error for each polynomial degree
plot(1:max_degree, cv_errors, type = "b", 
     xlab = "Degree of Polynomial", ylab = "Cross-Validation Error",
     main = "Cross-Validation Error vs. Degree of polynomial")

10-fold cross-validation을 wage ~ education, wage ~ age에 대해서 각각 진행한 결과, age를 예측변수로 사용한 쪽이 MSE가 훨씬 안정적으로 수렴하고 있다는 것을 확인할 수 있었다. 따라서, wage를 예측하는 데 있어서 age가 더 적절하다는 판단을 내린다.

Spline model
max_degree = 10
cv_errors_sp = rep(NA, max_degree)

# Fit wage ~ ns(age, df) for df = 1..max_degree and record the CV MSE,
# reusing the `ctrl` (10-fold CV) defined in the polynomial chunk above.
# This loop replaces ten copy-pasted train() calls. Labels fixed: a natural
# spline is indexed by degrees of freedom, not polynomial degree, and the
# original title misspelled "spline".
for (df_ns in seq_len(max_degree)) {
  fml = as.formula(paste0("wage ~ ns(age, df = ", df_ns, ")"))
  model_spline = train(fml, data = trainset_wage, method = "lm", trControl = ctrl)
  cv_errors_sp[df_ns] = mean(model_spline$results$RMSE^2)
}

plot(1:max_degree, cv_errors_sp, type = "b", 
     xlab = "Degrees of freedom of natural spline", ylab = "Cross-Validation Error",
     main = "Cross-Validation Error vs Degrees of freedom of natural spline")

#linear model -- plain wage ~ age baseline used in the model comparison
lm_model = lm(wage ~ age, data = trainset_wage)
compare Model : Polynomial vs Natural spline
# Compare held-out test MSE of the degree-2 polynomial, the 2-df natural
# spline, and the plain linear baseline, all fitted on the same training set.
poly_model_d2 <- lm(wage ~ poly(age, 2, raw=TRUE), data = trainset_wage)
ns_model_d2 <- lm(wage ~ ns(age, df = 2), data = trainset_wage)

# Test-set predictions for each candidate model
poly_predict <- predict(poly_model_d2, newdata = testset_wage)
ns_predict <- predict(ns_model_d2, newdata = testset_wage)
lm_predict <- predict(lm_model, newdata = testset_wage)

# Mean squared prediction error against the observed wages
actual <- testset_wage$wage
MSE_poly <- mean((actual - poly_predict)^2)
MSE_ns <- mean((actual - ns_predict)^2)
MSE_lm <- mean((actual - lm_predict)^2)
print(MSE_poly)
## [1] 1694.414
print(MSE_ns)
## [1] 1700.754
print(MSE_lm)
## [1] 1745.15

Polynomial model
degree : 2 MSE : 1694.414

Natural spline
degree : 2 MSE : 1700.754

Linear model MSE : 1745.15

해당 wage 변수를 추정할 때, linear model보다 non-linear model이 더 좋은 성능을 보인다는 사실을 확인했다. 또한, 근소한 차이로 polynomial model이 Natural spline 모델보다 더 높은 성능을 보였다.

Auto data set -> dependent variable : mpg

data(Auto)  # attach the Auto data set (392 cars, 9 variables)

goal)다른 변수들을 이용해서 mpg(연비)를 예측하는 것을 목표로 한다.

EDA

head(Auto)  # preview the first six rows
##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
## 5               ford torino
## 6          ford galaxie 500
summary(Auto)  # numeric summaries and factor counts for every column
##       mpg          cylinders      displacement     horsepower        weight    
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0   Min.   :1613  
##  1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.0   1st Qu.:2225  
##  Median :22.75   Median :4.000   Median :151.0   Median : 93.5   Median :2804  
##  Mean   :23.45   Mean   :5.472   Mean   :194.4   Mean   :104.5   Mean   :2978  
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8   3rd Qu.:126.0   3rd Qu.:3615  
##  Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0   Max.   :5140  
##                                                                                
##   acceleration        year           origin                      name    
##  Min.   : 8.00   Min.   :70.00   Min.   :1.000   amc matador       :  5  
##  1st Qu.:13.78   1st Qu.:73.00   1st Qu.:1.000   ford pinto        :  5  
##  Median :15.50   Median :76.00   Median :1.000   toyota corolla    :  5  
##  Mean   :15.54   Mean   :75.98   Mean   :1.577   amc gremlin       :  4  
##  3rd Qu.:17.02   3rd Qu.:79.00   3rd Qu.:2.000   amc hornet        :  4  
##  Max.   :24.80   Max.   :82.00   Max.   :3.000   chevrolet chevette:  4  
##                                                  (Other)           :365
str(Auto)  # structure: 392 obs. of 9 variables; note the na.action attribute
## 'data.frame':    392 obs. of  9 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : int  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : int  3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : int  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
##  - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
##   ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...

observations : 392
variables : 9

1.mpg : 연비(마일 당 갤런)
2.cylinders : 실린더 수
3.displacement: 엔진 배기량
4.horsepower: 엔진의 마력
5.weight: 차의 무게
6.acceleration: 0에서 60마일에 도달하는 시간 (초 단위)
7.year: 자동차 제조 연도
8.origin: {1:미국, 2:유럽, 3:일본} -> 자동차의 제조국
9.name: 자동차 모델 이름

자동차의 제조사 및 소비자 입장에서 mpg가 가장 주요한 변수 중 하나라고 생각했기 때문에, 해당 변수를 반응변수로 선택하였습니다.

# Baseline linear model: mpg on all predictors except the car name
# (name is a 304-level identifier factor, not a usable predictor).
auto_model = lm(mpg ~ . -name, data = Auto)
summary(auto_model)
## 
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5903 -2.1565 -0.1169  1.8690 13.0604 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -17.218435   4.644294  -3.707  0.00024 ***
## cylinders     -0.493376   0.323282  -1.526  0.12780    
## displacement   0.019896   0.007515   2.647  0.00844 ** 
## horsepower    -0.016951   0.013787  -1.230  0.21963    
## weight        -0.006474   0.000652  -9.929  < 2e-16 ***
## acceleration   0.080576   0.098845   0.815  0.41548    
## year           0.750773   0.050973  14.729  < 2e-16 ***
## origin         1.426141   0.278136   5.127 4.67e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared:  0.8215, Adjusted R-squared:  0.8182 
## F-statistic: 252.4 on 7 and 384 DF,  p-value: < 2.2e-16
# Linearity check: fitted vs. actual values should track the 45-degree line.
# (Fixed: "Linerity" typo in the title, and abline() was previously passed as
# an argument to plot() -- it is now a separate statement after the plot.)
plot(auto_model$fitted.values, Auto$mpg,
     xlab = "Fitted values",
     ylab = "Actual values",
     main = "Linearity: Fitted vs Actual")
abline(0, 1, col = "red")  # identity (y = x) reference line

# Homoscedasticity check: residuals should scatter evenly around zero
plot(auto_model$fitted.values, residuals(auto_model),
     xlab = "Fitted values",
     ylab = "Residuals",
     main = "Residuals vs Fitted Values")
abline(h = 0, col = "red")

# Normality check: studentized residuals vs. theoretical quantiles (car::qqPlot)
qqPlot(auto_model, main = "Q-Q Plot of Residuals")

## 323 327 
## 321 325

1.Linearity(선형성) fitted value와 actual value 사이에 선형적인 관계가 나타난다는 사실을 확인할 수 있었습니다.

2.Homoscedasticity(등분산성) residual plot을 통해서 확인해 본 결과, 잔차에서 2차 함수와 같은 패턴을 확인할 수 있었습니다. 선형 회귀 모델이 데이터를 잘 설명하고 있지 않다고 결론을 내릴 수 있었습니다.

3.Normality(정규성) Q-Q plot을 그려본 결과, residual이 정규 분포를 따르지 않았습니다.

결론)해당 데이터셋에는 linear model이 적합하지 않다는 결론을 내릴 수 있었습니다.

# 1. correlation
# NOTE(review): origin is already stored as an integer here (see str(Auto)),
# so the -1 shift below is cosmetic; cor() is shift-invariant, so the
# correlation matrix is unaffected either way.
num_vars <- Auto %>% select(mpg, cylinders, displacement, horsepower, weight, acceleration, year, origin)
num_vars$origin <- as.numeric(num_vars$origin) - 1 
 
cor_matrix <- cor(num_vars, use="complete.obs") #remove missing value
corrplot(cor_matrix, method = "number")

mpg는 cylinders, displacement, horsepower, weight과 큰 음의 상관관계를 보이는 것을 확인할 수 있었습니다. 따라서, mpg 예측에 cylinders, displacement, horsepower, weight를 예측변수의 후보로 사용합니다.

Histogram
# Distribution of each candidate predictor. binwidth is now matched to each
# variable's scale (the original reused binwidth = 5 everywhere, which is a
# single bin for cylinders and hundreds of near-empty bins for displacement
# and weight, obscuring their true shapes -- see summary(Auto) for ranges).
ggplot(Auto, aes(x = cylinders)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black") +  # discrete 3..8
  labs(title = "Histogram of cylinders in the Auto Dataset",
       x = "cylinders",
       y = "count") +
  theme_minimal()

ggplot(Auto, aes(x = displacement)) +
  geom_histogram(binwidth = 25, fill = "blue", color = "black") +  # range ~68-455
  labs(title = "Histogram of displacement in the Auto Dataset",
       x = "displacement",
       y = "count") +
  theme_minimal()

ggplot(Auto, aes(x = horsepower)) +
  geom_histogram(binwidth = 10, fill = "blue", color = "black") +  # range ~46-230
  labs(title = "Histogram of horsepower in the Auto Dataset",
       x = "horsepower",
       y = "count") +
  theme_minimal()

ggplot(Auto, aes(x = weight)) +
  geom_histogram(binwidth = 250, fill = "blue", color = "black") +  # range ~1613-5140
  labs(title = "Histogram of weight in the Auto Dataset",
       x = "weight",
       y = "count") +
  theme_minimal()

  1. cylinder : 2.5 ~ 7.5개인 경우가 대부분을 차지하고 있었다.
  2. displacement : 0 ~ 200 사이에 대부분의 데이터가 몰려있는 것을 확인했다. 이외의 구간에서의 데이터의 수가 부족했다.
  3. horsepower : right-skewed distribution의 형태를 띄고 있었다.
  4. weight : binwidth가 변수 범위(약 1613~5140)에 비해 매우 작아 히스토그램만으로는 분포 형태를 판단하기 어려웠다. 다만 summary에서 평균(2978)이 중앙값(2804)보다 큰 것으로 보아 오른쪽으로 치우친 분포에 가깝다.

모델 fitting 시, cylinder와 displacement는 제외하기로 한다.

fit the model : mpg ~ horsepower + weight

set.seed(123)  # reproducible partition below
Auto = na.omit(Auto)  # defensive; the 5 NA rows were already dropped (see na.action in str(Auto))
#train : test = 0.8 : 0.2
trainIndex = createDataPartition(Auto$mpg, p = 0.8, list = FALSE)  # caret: outcome-stratified split
trainset_auto <- Auto[trainIndex, ]
testset_auto <- Auto[-trainIndex, ]
polynomial model : mpg ~ horsepower + weight
ctrl = trainControl(method = "cv", number = 10)  # 10-fold cross-validation
max_degree = 10
cv_errors_auto = rep(NA, max_degree)

# Fit mpg ~ poly(horsepower, d) + poly(weight, d) for d = 1..max_degree and
# record the CV MSE (caret's RMSE squared). This loop replaces ten
# copy-pasted train() calls.
for (d in seq_len(max_degree)) {
  fml = as.formula(paste0("mpg ~ poly(horsepower, ", d,
                          ", raw=TRUE) + poly(weight, ", d, ", raw = TRUE)"))
  model = train(fml, data = trainset_auto, method = "lm", trControl = ctrl)
  cv_errors_auto[d] = mean(model$results$RMSE^2)
}

# Visualize the cross-validation error for each polynomial degree
plot(1:max_degree, cv_errors_auto, type = "b", 
     xlab = "Degree of polynomial", ylab = "Cross-Validation Error",
     main = "Cross-Validation Error vs Degree of polynomial")

##### Spline model

max_degree = 10
cv_errors_auto = rep(NA, max_degree)

# Fit mpg ~ ns(horsepower, df) + ns(weight, df) for df = 1..max_degree and
# record the CV MSE, reusing `ctrl` (10-fold CV) from the polynomial chunk.
# This loop replaces ten copy-pasted train() calls. Label fixed: the x axis
# previously said "Degree of polynomial" on this natural-spline plot.
for (df_ns in seq_len(max_degree)) {
  fml = as.formula(paste0("mpg ~ ns(horsepower, df = ", df_ns,
                          ") + ns(weight, df = ", df_ns, ")"))
  model_spline = train(fml, data = trainset_auto, method = "lm", trControl = ctrl)
  cv_errors_auto[df_ns] = mean(model_spline$results$RMSE^2)
}

plot(1:max_degree, cv_errors_auto, type = "b", 
     xlab = "Degrees of freedom of natural spline", ylab = "Cross-Validation Error",
     main = "Cross-Validation Error vs Degrees of freedom of natural spline")

polynomial model과 natural spline의 CV MSE를 비교한 결과, 둘다 df가 2일 때부터 유의미한 성능 향상이 나타났다.
따라서, df가 2인 모델을 선정하기로 한다.

compare Model : Polynomial vs Natural spline
# Compare held-out test MSE of the degree-2 polynomial, the 2-df natural
# spline, and the plain linear baseline, all fitted on the same training set.
poly_model_auto_d2 <- lm(mpg ~ poly(weight, 2, raw=TRUE) + poly(horsepower, 2, raw=TRUE), data = trainset_auto)
ns_model_auto_d2 <- lm(mpg ~ ns(weight, df = 2) + ns(horsepower, df= 2), data = trainset_auto)
linear_model_auto <- lm(mpg ~ weight + horsepower, data = trainset_auto)

# Test-set predictions for each candidate model
poly_predict_auto <- predict(poly_model_auto_d2, newdata = testset_auto)
ns_predict_auto <- predict(ns_model_auto_d2, newdata = testset_auto)
lm_predict_auto <- predict(linear_model_auto, newdata = testset_auto)

# Mean squared prediction error against the observed mpg
actual_mpg <- testset_auto$mpg
MSE_poly_auto <- mean((actual_mpg - poly_predict_auto)^2)
MSE_ns_auto <- mean((actual_mpg - ns_predict_auto)^2)
MSE_lm_auto <- mean((actual_mpg - lm_predict_auto)^2)
print(MSE_poly_auto)
## [1] 13.21094
print(MSE_ns_auto)
## [1] 13.13274
print(MSE_lm_auto)
## [1] 16.47695

Polynomial model
degree : 2 MSE : 13.21094

Natural spline
degree : 2 MSE : 13.13274

Linear model MSE : 16.47695

mpg 변수를 추정할 때, linear model보다 non-linear model이 더 좋은 성능을 보인다는 사실을 알 수 있었습니다. polynomial과 Natural spline의 경우에는 유의미한 성능 차이를 볼 수 없었습니다.

College data set -> dependent variable : Apps

data(College)  # attach the College data set (777 colleges, 18 variables)

EDA

head(College)  # preview the first six rows
##                              Private Apps Accept Enroll Top10perc Top25perc
## Abilene Christian University     Yes 1660   1232    721        23        52
## Adelphi University               Yes 2186   1924    512        16        29
## Adrian College                   Yes 1428   1097    336        22        50
## Agnes Scott College              Yes  417    349    137        60        89
## Alaska Pacific University        Yes  193    146     55        16        44
## Albertson College                Yes  587    479    158        38        62
##                              F.Undergrad P.Undergrad Outstate Room.Board Books
## Abilene Christian University        2885         537     7440       3300   450
## Adelphi University                  2683        1227    12280       6450   750
## Adrian College                      1036          99    11250       3750   400
## Agnes Scott College                  510          63    12960       5450   450
## Alaska Pacific University            249         869     7560       4120   800
## Albertson College                    678          41    13500       3335   500
##                              Personal PhD Terminal S.F.Ratio perc.alumni Expend
## Abilene Christian University     2200  70       78      18.1          12   7041
## Adelphi University               1500  29       30      12.2          16  10527
## Adrian College                   1165  53       66      12.9          30   8735
## Agnes Scott College               875  92       97       7.7          37  19016
## Alaska Pacific University        1500  76       72      11.9           2  10922
## Albertson College                 675  67       73       9.4          11   9727
##                              Grad.Rate
## Abilene Christian University        60
## Adelphi University                  56
## Adrian College                      54
## Agnes Scott College                 59
## Alaska Pacific University           15
## Albertson College                   55
summary(College)  # numeric summaries and factor counts for every column
##  Private        Apps           Accept          Enroll       Top10perc    
##  No :212   Min.   :   81   Min.   :   72   Min.   :  35   Min.   : 1.00  
##  Yes:565   1st Qu.:  776   1st Qu.:  604   1st Qu.: 242   1st Qu.:15.00  
##            Median : 1558   Median : 1110   Median : 434   Median :23.00  
##            Mean   : 3002   Mean   : 2019   Mean   : 780   Mean   :27.56  
##            3rd Qu.: 3624   3rd Qu.: 2424   3rd Qu.: 902   3rd Qu.:35.00  
##            Max.   :48094   Max.   :26330   Max.   :6392   Max.   :96.00  
##    Top25perc      F.Undergrad     P.Undergrad         Outstate    
##  Min.   :  9.0   Min.   :  139   Min.   :    1.0   Min.   : 2340  
##  1st Qu.: 41.0   1st Qu.:  992   1st Qu.:   95.0   1st Qu.: 7320  
##  Median : 54.0   Median : 1707   Median :  353.0   Median : 9990  
##  Mean   : 55.8   Mean   : 3700   Mean   :  855.3   Mean   :10441  
##  3rd Qu.: 69.0   3rd Qu.: 4005   3rd Qu.:  967.0   3rd Qu.:12925  
##  Max.   :100.0   Max.   :31643   Max.   :21836.0   Max.   :21700  
##    Room.Board       Books           Personal         PhD        
##  Min.   :1780   Min.   :  96.0   Min.   : 250   Min.   :  8.00  
##  1st Qu.:3597   1st Qu.: 470.0   1st Qu.: 850   1st Qu.: 62.00  
##  Median :4200   Median : 500.0   Median :1200   Median : 75.00  
##  Mean   :4358   Mean   : 549.4   Mean   :1341   Mean   : 72.66  
##  3rd Qu.:5050   3rd Qu.: 600.0   3rd Qu.:1700   3rd Qu.: 85.00  
##  Max.   :8124   Max.   :2340.0   Max.   :6800   Max.   :103.00  
##     Terminal       S.F.Ratio      perc.alumni        Expend     
##  Min.   : 24.0   Min.   : 2.50   Min.   : 0.00   Min.   : 3186  
##  1st Qu.: 71.0   1st Qu.:11.50   1st Qu.:13.00   1st Qu.: 6751  
##  Median : 82.0   Median :13.60   Median :21.00   Median : 8377  
##  Mean   : 79.7   Mean   :14.09   Mean   :22.74   Mean   : 9660  
##  3rd Qu.: 92.0   3rd Qu.:16.50   3rd Qu.:31.00   3rd Qu.:10830  
##  Max.   :100.0   Max.   :39.80   Max.   :64.00   Max.   :56233  
##    Grad.Rate     
##  Min.   : 10.00  
##  1st Qu.: 53.00  
##  Median : 65.00  
##  Mean   : 65.46  
##  3rd Qu.: 78.00  
##  Max.   :118.00
str(College)  # structure: 777 obs. of 18 variables, with column types
## 'data.frame':    777 obs. of  18 variables:
##  $ Private    : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Apps       : num  1660 2186 1428 417 193 ...
##  $ Accept     : num  1232 1924 1097 349 146 ...
##  $ Enroll     : num  721 512 336 137 55 158 103 489 227 172 ...
##  $ Top10perc  : num  23 16 22 60 16 38 17 37 30 21 ...
##  $ Top25perc  : num  52 29 50 89 44 62 45 68 63 44 ...
##  $ F.Undergrad: num  2885 2683 1036 510 249 ...
##  $ P.Undergrad: num  537 1227 99 63 869 ...
##  $ Outstate   : num  7440 12280 11250 12960 7560 ...
##  $ Room.Board : num  3300 6450 3750 5450 4120 ...
##  $ Books      : num  450 750 400 450 800 500 500 450 300 660 ...
##  $ Personal   : num  2200 1500 1165 875 1500 ...
##  $ PhD        : num  70 29 53 92 76 67 90 89 79 40 ...
##  $ Terminal   : num  78 30 66 97 72 73 93 100 84 41 ...
##  $ S.F.Ratio  : num  18.1 12.2 12.9 7.7 11.9 9.4 11.5 13.7 11.3 11.5 ...
##  $ perc.alumni: num  12 16 30 37 2 11 26 37 23 15 ...
##  $ Expend     : num  7041 10527 8735 19016 10922 ...
##  $ Grad.Rate  : num  60 56 54 59 15 55 63 73 80 52 ...

observations : 777
variables : 18

1.Private: 사립 대학 여부 (Yes/No)
2.Apps: 접수된 지원서 수
3.Accept: 합격한 학생 수
4.Enroll: 등록한 학생 수
5.Top10perc: 상위 10%의 고등학교 성적을 가진 학생 비율
6.Top25perc: 상위 25%의 고등학교 성적을 가진 학생 비율
7.F.Undergrad: 풀타임 학부생 수
8.P.Undergrad: 파트타임 학부생 수
9.Outstate: 외지 학생 등록금
10.Room.Board: 숙소 및 식사비
11.Books: 교재비
12.Personal: 개인 경비
13.PhD: 박사학위 보유한 교수 비율
14.Terminal: 최종 학위(terminal degree)를 보유한 교수 비율
15.S.F.Ratio: 학생/교수 비율
16.perc.alumni: 기부한 동문 비율
17.Expend: 학생 당 교육 비용
18.Grad.Rate: 졸업생 비율

대학 또는 지원자들 입장에서 Apps가 주요한 관심사 중 하나라고 생각했기 때문에, 해당 변수를 반응변수로 설정하였습니다.

# Baseline multiple linear regression: Apps on all remaining variables
college_model = lm(Apps ~ ., data = College)
summary(college_model)
## 
## Call:
## lm(formula = Apps ~ ., data = College)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4908.8  -430.2   -29.5   322.3  7852.5 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -445.08413  408.32855  -1.090 0.276053    
## PrivateYes  -494.14897  137.81191  -3.586 0.000358 ***
## Accept         1.58581    0.04074  38.924  < 2e-16 ***
## Enroll        -0.88069    0.18596  -4.736 2.60e-06 ***
## Top10perc     49.92628    5.57824   8.950  < 2e-16 ***
## Top25perc    -14.23448    4.47914  -3.178 0.001543 ** 
## F.Undergrad    0.05739    0.03271   1.754 0.079785 .  
## P.Undergrad    0.04445    0.03214   1.383 0.167114    
## Outstate      -0.08587    0.01906  -4.506 7.64e-06 ***
## Room.Board     0.15103    0.04829   3.127 0.001832 ** 
## Books          0.02090    0.23841   0.088 0.930175    
## Personal       0.03110    0.06308   0.493 0.622060    
## PhD           -8.67850    4.63814  -1.871 0.061714 .  
## Terminal      -3.33066    5.09494  -0.654 0.513492    
## S.F.Ratio     15.38961   13.00622   1.183 0.237081    
## perc.alumni    0.17867    4.10230   0.044 0.965273    
## Expend         0.07790    0.01235   6.308 4.79e-10 ***
## Grad.Rate      8.66763    2.94893   2.939 0.003390 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1041 on 759 degrees of freedom
## Multiple R-squared:  0.9292, Adjusted R-squared:  0.9276 
## F-statistic: 585.9 on 17 and 759 DF,  p-value: < 2.2e-16
# Linearity
plot(college_model$fitted.values, College$Apps,
     xlab = "Fitted values",
     ylab = "Actual values",
     main = "Linerity:Fitted vs Actual",
     abline(0, 1, col = "red")
     )

# Homoscedasticity check: residuals vs. fitted values.
# A patternless horizontal band around zero supports constant error variance.
plot(
  x    = college_model$fitted.values,
  y    = residuals(college_model),
  xlab = "Fitted values",
  ylab = "Residuals",
  main = "Residuals vs Fitted Values"
)
abline(h = 0, col = "red")

#Normality
# Normality check: car::qqPlot draws a QQ plot of the model residuals with a
# confidence envelope and returns/labels the most extreme observations
# (printed below: Princeton University, Rutgers at New Brunswick).
qqPlot(college_model, main = "Q-Q Plot of Residuals")

##     Princeton University Rutgers at New Brunswick 
##                      460                      484

1.Linearity(선형성)
fitted value와 actual value 사이에 선형적인 관계가 나타난다는 사실을 확인할 수 있었습니다. 그러나, 데이터가 낮은 값 쪽(왼쪽)에 몰려 있어 오른쪽으로 꼬리가 긴 분포를 보였습니다.

2.Homoscedasticity(등분산성)
residual plot을 확인한 결과, residual의 특정한 패턴은 확인할 수 없었습니다.

3.Normality(정규성)
Q-Q plot을 그려본 결과, residual이 정규분포를 따르지 않는다는 사실을 확인할 수 있었습니다.

결론)
해당 데이터셋에는 선형 모델이 적합하지 않다는 결론을 내릴 수 있었습니다.

# Correlation among selected numeric predictors plus the response Apps.
num_vars <- College %>%
  select(Enroll, Outstate, Grad.Rate, PhD, S.F.Ratio, Apps, Accept, Private, Expend)
# Recode the Private factor to 0/1 so it can enter the correlation matrix.
num_vars$Private <- as.numeric(num_vars$Private) - 1

cor_matrix <- cor(num_vars, use = "complete.obs")
corrplot(cor_matrix, method = "number")

Apps은 Enroll, Accept과의 큰 양의 상관관계를 보이는 것을 확인할 수 있습니다.

histogram
# Distribution of Top10perc (share of new students from the top 10% of their
# HS class). Title fixed: it previously said "Enroll" although the plot shows
# Top10perc.
ggplot(College, aes(x = Top10perc)) +
  geom_histogram(binwidth = 5, fill = "blue", color = "black") +
  labs(title = "Histogram of Top10perc in the College Dataset",
       x = "Top10perc",
       y = "count") +
  theme_minimal()

# Distribution of Outstate (out-of-state tuition in dollars).
# Fixes: the title said "Enroll" and the x-label said "Top10perc" although the
# plot shows Outstate; binwidth = 5 on a dollar-scale variable produced
# thousands of near-empty bins, so it is widened to 500.
ggplot(College, aes(x = Outstate)) +
  geom_histogram(binwidth = 500, fill = "blue", color = "black") +
  labs(title = "Histogram of Outstate in the College Dataset",
       x = "Outstate",
       y = "count") +
  theme_minimal()

# Distribution of Expend (instructional expenditure per student, in dollars).
# Fixes: the title said "Enroll" although the plot shows Expend, and
# binwidth = 5 is far too fine for a dollar-scale variable.
ggplot(College, aes(x = Expend)) +
  geom_histogram(binwidth = 1000, fill = "blue", color = "black") +
  labs(title = "Histogram of Expend in the College Dataset",
       x = "Expend",
       y = "count") +
  theme_minimal()

# Distribution of Enroll (number of new students enrolled).
# binwidth = 5 is too fine for a count variable that reaches into the
# thousands; widened to 100 so the distribution's shape is visible.
ggplot(College, aes(x = Enroll)) +
  geom_histogram(binwidth = 100, fill = "blue", color = "black") +
  labs(title = "Histogram of Enroll in the College Dataset",
       x = "Enroll",
       y = "count") +
  theme_minimal()

# Distribution of Accept (number of applications accepted).
# binwidth widened from 5 to 500 — Accept spans thousands of applications,
# so binwidth = 5 yielded thousands of near-empty bins.
ggplot(College, aes(x = Accept)) +
  geom_histogram(binwidth = 500, fill = "blue", color = "black") +
  labs(title = "Histogram of Accept in the College Dataset",
       x = "Accept",
       y = "count") +
  theme_minimal()

  1. Enroll : right-skewed distribution을 따른다.
  2. Accept : right-skewed distribution을 따른다.
  3. Top10perc : right-skewed distribution을 따른다.

3개의 예측변수 모두 right-skewed distribution을 따르지만, Top10perc가 가장 명확한 right-skewed distribution을 따르고 있다는 사실을 확인할 수 있었다.
따라서, 분포의 형태가 가장 명확한 Top10perc를 Apps의 예측변수로 사용하기로 한다.

fit the model : Apps ~ Top10perc

set.seed(123)
# Bug fix: the original assigned na.omit(College) to an unrelated variable
# named `Auto` (leftover from another exercise) and never used the result.
College <- na.omit(College)

# train : test = 0.8 : 0.2
trainIndex <- createDataPartition(College$Apps, p = 0.8, list = FALSE)
trainset_college <- College[trainIndex, ]
testset_college  <- College[-trainIndex, ]

ctrl <- trainControl(method = "cv", number = 10)  # 10-fold cross-validation

# 10-fold CV error (mean of squared fold RMSEs) for polynomial degrees 1..10.
# Replaces ten copy-pasted train() calls with a loop; the train() calls are
# issued in the same order, so the RNG stream (and thus the folds) match.
max_degree <- 10
cv_errors_college <- rep(NA, max_degree)
for (d in seq_len(max_degree)) {
  model <- train(Apps ~ poly(Top10perc, degree = d, raw = TRUE),
                 data = trainset_college, method = "lm", trControl = ctrl)
  cv_errors_college[d] <- mean(model$results$RMSE^2)
}

plot(seq_len(max_degree), cv_errors_college, type = "b",
     xlab = "Degree of polynomial", ylab = "Cross-Validation Error",
     main = "Cross-Validation Error vs Degree of polynomial")

Spline model
# 10-fold CV error for natural splines with df = 1..7, replacing seven
# copy-pasted train() calls with a loop.
max_df <- 7
cv_errors_college <- rep(NA, max_df)
for (d in seq_len(max_df)) {
  model_spline <- train(Apps ~ ns(Top10perc, df = d),
                        data = trainset_college, method = "lm", trControl = ctrl)
  cv_errors_college[d] <- mean(model_spline$results$RMSE^2)
}

# x-axis label fixed: it said "Degree of polynomial" although the x-axis is
# the spline's degrees of freedom.
plot(seq_len(max_df), cv_errors_college, type = "b",
     xlab = "Degrees of freedom (df) of natural spline",
     ylab = "Cross-Validation Error",
     main = "Cross-Validation Error vs Degree of spline")

polynomial model과 spline model 둘 다 df가 4일 때, 가장 낮은 MSE를 보인다.
polynomial model 과 spline model의 df = 4 을 기준으로 모델을 fitting 하기로 한다.

compare Model : Polynomial vs Natural spline
# Final fits on the training set at the CV-selected flexibility (degree/df = 4)
# plus a plain linear model as baseline.
# Renamed *_d1 -> *_d4: the old suffix contradicted the actual degree/df of 4.
poly_model_college_d4 <- lm(Apps ~ poly(Top10perc, 4, raw = TRUE), data = trainset_college)
ns_model_college_d4 <- lm(Apps ~ ns(Top10perc, df = 4), data = trainset_college)
linear_model_college <- lm(Apps ~ Top10perc, data = trainset_college)

# Held-out test-set predictions.
poly_predict_college <- predict(poly_model_college_d4, newdata = testset_college)
ns_predict_college <- predict(ns_model_college_d4, newdata = testset_college)
lm_predict_college <- predict(linear_model_college, newdata = testset_college)

# Test-set mean squared errors (printed below).
MSE_poly_college <- mean((testset_college$Apps - poly_predict_college)^2)
MSE_ns_college <- mean((testset_college$Apps - ns_predict_college)^2)
MSE_lm_college <- mean((testset_college$Apps - lm_predict_college)^2)
print(MSE_poly_college)
## [1] 5944005
print(MSE_ns_college)
## [1] 5970815
print(MSE_lm_college)
## [1] 6385897

Polynomial model
degree : 4 MSE : 5944005

Natural spline
degree : 4 MSE : 5970815

Linear model MSE : 6385897

Apps 변수를 추정할 때, linear model보다 non-linear model이 약간 더 좋은 성능을 보이는 것을 확인할 수 있었습니다.
polynomial과 Natural spline의 경우에는 유의미한 성능 차이를 볼 수 없었습니다.